Use TensorFlow to build and train a neural net for recognizing which language a word belongs to (English, French, or Mandarin in pinyin).
The neural net has mediocre overall performance, likely because I didn't spend much time optimizing it, and because the problem is somewhat ill-posed: a string can be a valid word in several languages at once. However, for English and Mandarin words it beat the 33% accuracy expected from random guessing among three languages, which means learning was successful.
In [1]:
# imports
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import string
import math
import tabulate
import os
In [2]:
# Define constants for the training data
WORD_LENGTH = 20
feature_length = 26*WORD_LENGTH
languages = "english french mandarin".split()
num_of_languages = len(languages)
# Constants for saving
save_dir = '.\\nn_save\\'
# Function for converting words to vectors
# Letters are stored as a list of 26 integers, all 0 except for one, which is a 1
# E.g. a is [1, 0, 0... <25 0's>]
# E.g. z is [0, 0 ... <25 0's>, 1]
# Overall 20 letters are stored sequentially
# Punctuation and white space is ignored
def vectorize_word(word):
    l_final = []
    for i in range(WORD_LENGTH):
        l_next = [0]*26
        try:
            l_next[string.ascii_lowercase.index(word[i])] = 1
        except (IndexError, ValueError):
            # Word is shorter than WORD_LENGTH, or the character is
            # punctuation/whitespace: leave this letter block all zeros
            pass
        l_final.extend(l_next)
    return l_final
f_out = open(r'.\data\nn_params.txt', 'w')
f_out.write("{}\n".format(WORD_LENGTH))
f_out.write(save_dir+'\n')
f_out.write(" ".join(languages)+'\n')
f_out.close()
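As a quick sanity check of the encoding (a hypothetical snippet, not part of the notebook's pipeline): "ab" should set index 0 of the first 26-slot block and index 1 of the second, leaving the other 18 blocks all zeros.
v = vectorize_word("ab")
print(len(v))             # 520 == 26 * WORD_LENGTH
print(v[:26].index(1))    # 0 -> 'a' in the first letter block
print(v[26:52].index(1))  # 1 -> 'b' in the second letter block
print(any(v[52:]))        # False -> the remaining 18 blocks are all zeros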
In [3]:
# Create training data
training_data = []
training_answers = []
for i, lang in enumerate(languages):
    # Read files (raw string so the backslash isn't treated as an escape)
    f_in = open(r".\data\{}.txt".format(lang))
    words = [w.strip() for w in f_in.readlines()]
    f_in.close()
    # Vectorize words
    vector_words = [vectorize_word(w) for w in words]
    # Vectorize output (one-hot over the languages)
    l = [0]*num_of_languages
    l[i] = 1
    vector_language = [l for w in words]
    # Add to training data
    training_data.extend(vector_words)
    training_answers.extend(vector_language)
# Convert data to numpy array
training_data = np.array(training_data)
training_answers = np.array(training_answers)
In [4]:
# Summarize training data
print("Training data shape: {}".format(training_data.shape))
In [5]:
# Input and output variables
x = tf.placeholder(tf.float32, [None, feature_length])
y_ = tf.placeholder(tf.float32, [None, num_of_languages])
# Define the number of neurons in each layer
layer_lengths = [feature_length, 40, num_of_languages]
# Create each layer
neural_net = []
last_output = x
for i, current_layer_length in enumerate(layer_lengths[1:]):
    # Length of the previous layer (layer_lengths[i], because of the [1:] offset)
    last_layer_length = layer_lengths[i]
    # Create the variables for this layer
    W = tf.Variable(tf.truncated_normal([last_layer_length, current_layer_length],
                                        stddev=1 / math.sqrt(last_layer_length)))
    b = tf.Variable(tf.constant(0.1, shape=[current_layer_length]))
    h = tf.sigmoid(tf.matmul(last_output, W) + b)
    # Store the variables for this layer
    neural_net.append((W, b, h))
    # Update the last output
    last_output = h
# Output layer (softmax)
y = tf.nn.softmax(last_output)
# Scoring (use cross-entropy loss)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), axis=1))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
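A caveat on the loss above: tf.log(y) becomes NaN if any softmax output underflows to zero, and the layer loop also applies a sigmoid to the output layer before the softmax. A more numerically stable variant, sketched here under the assumption that the extra sigmoid is dropped (TensorFlow 1.x API), rebuilds the output layer's pre-activation logits and lets TensorFlow fuse the softmax with the log:
# Sketch only: neural_net[-1] holds the output layer's (W, b, h), and
# neural_net[-2][2] is the hidden-layer activation feeding into it
W_out, b_out, _ = neural_net[-1]
logits = tf.matmul(neural_net[-2][2], W_out) + b_out
stable_cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
# One could then minimize stable_cross_entropy instead of cross_entropy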
In [6]:
# Initialize variables
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)
# Initialize accuracy metrics
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
accuracy_tracker = []
# Run the training
batch_size = 500
for i in range(40000):
    batch_indices = np.random.randint(training_data.shape[0], size=batch_size)
    batch_xs = training_data[batch_indices]
    batch_ys = training_answers[batch_indices]
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
    # Periodically print a progress readout
    if (i+1) % 2000 == 0:
        corr_pred = sess.run(correct_prediction, feed_dict={x: training_data, y_: training_answers})
        # corr_pred is a boolean array; corr_pred[corr_pred] keeps the True entries
        correct, total = len(corr_pred[corr_pred]), len(corr_pred)
        acc = float(correct)/total
        accuracy_tracker.append((i+1, acc))
        print("Batch {:0>5d}- {:.4f} ({:0>5d}/{})".format(i+1, acc, correct, total))
# Plot training accuracy improvement
plt.plot(*zip(*accuracy_tracker))
plt.xlabel("Batch number")
plt.ylabel("Accuracy")
plt.title("Training Accuracy for language recognition neural net")
plt.show()
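Worth noting: the accuracy above is measured on the same data the net trains on, so it likely overstates how well the net generalizes. A held-out check would look roughly like this (hypothetical sketch; the split would have to happen before the training loop for the number to be meaningful):
# Hypothetical: carve off 10% of the examples as a validation set
n = training_data.shape[0]
perm = np.random.permutation(n)
val_idx, train_idx = perm[:n // 10], perm[n // 10:]
# ... train only on training_data[train_idx] / training_answers[train_idx] ...
val_acc = sess.run(accuracy, feed_dict={x: training_data[val_idx],
                                        y_: training_answers[val_idx]})
print("Validation accuracy: {:.4f}".format(val_acc))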
In [7]:
# Function for testing words
def get_predictions(test_words):
    test_words_vectorized = np.array([vectorize_word(w) for w in test_words])
    # Run the net to get the softmax predictions
    test_results = sess.run(y, feed_dict={x: test_words_vectorized})
    return test_results

# Function that tests words and prints the predictions as a nice table
def predictions_table(test_words, answers=None):
    # test_words is a list of strings (the words)
    # answers, if given, lets the table flag wrong predictions:
    # answers[i] == n means that test_words[i] is in languages[n]
    predictions = get_predictions(test_words)
    table = [[w] for w in test_words]  # First column of the table is the word
    # Remaining columns are the predicted probabilities, as percentages
    table = [t + ["{:.1f}".format(p*100) for p in pred]
             for t, pred in zip(table, predictions)]
    headers = ["Word"] + [l.title() for l in languages]
    # Possibly mark wrong answers
    if answers is not None:
        # Find the words it answered correctly
        correct = np.array([p[i] == np.max(p) for p, i in zip(predictions, answers)])
        # Add an answers column to the table
        for i, c in enumerate(correct):
            if c:
                table[i] += [""]
            else:
                table[i] += ["Wrong!"]
        headers += ["Correct?"]
    # Print the table
    print(tabulate.tabulate(table, headers=headers))
    # Possibly print the accuracy
    if answers is not None:
        print("Accuracy: {:.2f}%".format(100.*len(correct[correct])/len(correct)))
In [8]:
# English words
english_words = "hello my dear chap let's have a bit of coffee".split()
english_words += "oh my heavens look at what this neural net can do".split()
english_words += "it looks like english words are often quite similar to french ones".split()
predictions_table(english_words, answers=[0]*len(english_words))
In [9]:
# French words
# Note the lack of accents (the vectorizer doesn't handle accents)
# Note my poor French also
french_words = "bonjour mon ami j'adore le francais. C'est une belle langue".split()
french_words += "je mange une croissant avec une baguette et du brie".split()
french_words += "ca c'est comment on fait des choses en france!".split()
predictions_table(french_words, answers=[1]*len(french_words))
In [10]:
# Mandarin Words
# Note I am typing in pinyin with no tones
mandarin_words = "xuexi zhongwen zhende hen nan".split()
mandarin_words += "wo hen xihuan pinyin yinwei bangzhu wo kanshu de bijiao rongyi".split()
mandarin_words += "sishisi jiu shi tebie nan shuochulai".split()
mandarin_words += "qilai, bu yuan zuo nuli de renmen!".split() # Gotta please the censors ;)
predictions_table(mandarin_words, answers=[2]*len(mandarin_words))
In [11]:
# Save neural net
# saver = tf.train.Saver()
# if not os.path.exists(save_dir):
#     os.makedirs(save_dir)
# save_path = saver.save(sess, save_dir)
# print(save_path)
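Restoring would mirror the save; a sketch assuming the commented-out lines above were actually run and save_path was kept around:
# saver = tf.train.Saver()
# saver.restore(sess, save_path)  # save_path as returned by saver.save(...)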
In [12]:
# Close the session
# sess.close()